In [1]:
!pip install geopy
!pip install tqdm
!pip install geocoder


Requirement already satisfied: geopy in /home/danielmarx/anaconda3/lib/python3.6/site-packages
Requirement already satisfied: tqdm in /home/danielmarx/anaconda3/lib/python3.6/site-packages
Requirement already satisfied: geocoder in /home/danielmarx/anaconda3/lib/python3.6/site-packages
Requirement already satisfied: click in /home/danielmarx/anaconda3/lib/python3.6/site-packages (from geocoder)
Requirement already satisfied: requests in /home/danielmarx/anaconda3/lib/python3.6/site-packages (from geocoder)
Requirement already satisfied: ratelim in /home/danielmarx/anaconda3/lib/python3.6/site-packages (from geocoder)
Requirement already satisfied: six in /home/danielmarx/anaconda3/lib/python3.6/site-packages (from geocoder)
Requirement already satisfied: decorator in /home/danielmarx/anaconda3/lib/python3.6/site-packages (from ratelim->geocoder)

In [2]:
# Import pandas
import pandas as pd
import geocoder as gc
from tqdm import tqdm
#from geopy.geocoders import Nominatim
#from geopy.distance import vincenty

# Import BoxPlot, output_notebook, and show from bokeh.charts
from bokeh.charts import BoxPlot, Donut, Bar, Histogram, output_notebook, show
from bokeh.charts.attributes import cat, color
from bokeh.charts.operations import blend
from bokeh.layouts import gridplot, row
from bokeh.models import HoverTool
from bokeh.models.widgets import Panel, Tabs
from bokeh.plotting import ColumnDataSource

In [3]:
# Assign spreadsheet filename: file
file = 'imd_student_blind.xlsx'

# Load spreadsheet: xl
xl = pd.ExcelFile(file)

# Print sheet names
print(xl.sheet_names)


['Sheet1']

In [4]:
# Load a sheet into a DataFrame by index: df
df = xl.parse(0)

# Print the head of the DataFrame df
df.head()


Out[4]:
a_ID CEP ano_ingresso periodo_ingresso status ano_disciplina periodo_disciplina nota disciplina_ID status.disciplina
0 0 59015430 2014 1 CANCELADO 2014 2 2.6 0 Reprovado
1 0 59015430 2014 1 CANCELADO 2015 1 8.0 0 Aprovado
2 1 59073120 2014 1 CANCELADO 2014 2 0.1 0 Reprovado
3 2 59072580 2014 1 ATIVO 2014 2 6.1 0 Aprovado
4 3 59088150 2014 1 ATIVO 2014 1 3.0 0 Reprovado

In [5]:
df.columns


Out[5]:
Index(['a_ID', 'CEP', 'ano_ingresso', 'periodo_ingresso', 'status',
       'ano_disciplina', 'periodo_disciplina', 'nota', 'disciplina_ID',
       'status.disciplina'],
      dtype='object')

In [6]:
df.shape


Out[6]:
(4842, 10)

In [7]:
#Cópia por valores, não mexer no DF principal
disciplinas = df.copy()

disciplinas.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 4842 entries, 0 to 4841
Data columns (total 10 columns):
a_ID                  4842 non-null int64
CEP                   4842 non-null int64
ano_ingresso          4842 non-null int64
periodo_ingresso      4842 non-null int64
status                4842 non-null object
ano_disciplina        4842 non-null int64
periodo_disciplina    4842 non-null int64
nota                  4842 non-null float64
disciplina_ID         4842 non-null int64
status.disciplina     4842 non-null object
dtypes: float64(1), int64(7), object(2)
memory usage: 416.1+ KB

In [8]:
disciplinas.columns.values.tolist()


Out[8]:
['a_ID',
 'CEP',
 'ano_ingresso',
 'periodo_ingresso',
 'status',
 'ano_disciplina',
 'periodo_disciplina',
 'nota',
 'disciplina_ID',
 'status.disciplina']

In [9]:
#Retira informações desnecessárias para essa análise

disciplinas.drop(disciplinas.columns[0:4], axis=1, inplace=True)

In [10]:
#Objetivo 1 -> Média da disciplina X por período e ano
#Objetivo 2 -> Taxa de trancamento e cancelamento por período e ano
disciplinas.columns.values.tolist()


Out[10]:
['status',
 'ano_disciplina',
 'periodo_disciplina',
 'nota',
 'disciplina_ID',
 'status.disciplina']

In [11]:
disciplinas = disciplinas[disciplinas['status'] == 'ATIVO']
disciplinas.head()


Out[11]:
status ano_disciplina periodo_disciplina nota disciplina_ID status.disciplina
3 ATIVO 2014 2 6.1 0 Aprovado
4 ATIVO 2014 1 3.0 0 Reprovado
5 ATIVO 2014 2 7.2 0 Aprovado
8 ATIVO 2014 2 8.9 0 Aprovado
12 ATIVO 2014 2 5.6 0 Aprovado

In [12]:
disciplinas_categorias =  disciplinas.groupby(['ano_disciplina', 'periodo_disciplina', 'disciplina_ID']).mean()

# convert the index to a column
disciplinas_categorias.reset_index(inplace = True )
disciplinas_categorias.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42 entries, 0 to 41
Data columns (total 4 columns):
ano_disciplina        42 non-null int64
periodo_disciplina    42 non-null int64
disciplina_ID         42 non-null int64
nota                  42 non-null float64
dtypes: float64(1), int64(3)
memory usage: 1.4 KB

In [13]:
disciplinas_categorias[disciplinas_categorias['disciplina_ID'] == 0]


Out[13]:
ano_disciplina periodo_disciplina disciplina_ID nota
0 2014 1 0 5.872727
6 2014 2 0 5.625676
13 2015 1 0 6.126316
19 2015 2 0 5.486555
26 2016 1 0 4.139394
33 2016 2 0 5.750758

In [14]:
# Make a box plot: p
p = BoxPlot(disciplinas_categorias, values='nota', label='disciplina_ID', color='disciplina_ID',
             title='Distribuição da média de notas das disciplinas',
             legend='bottom_right')

# Set the y axis label
p.yaxis.axis_label='Média por período'
p.add_tools(HoverTool(tooltips=[("Nota", "$y")]))

# Call the output_notebook() 
output_notebook()
show(p)


Loading BokehJS ...

In [15]:
ps = []
#tbs = []

for atual in range(0,7):
    p = BoxPlot(disciplinas_categorias[disciplinas_categorias['disciplina_ID'] == atual], values='nota', label='periodo_disciplina',
                        color='periodo_disciplina', title='Ddisciplina '+str(atual), legend='bottom_right')
    p.yaxis.axis_label='Média por período'
    p.add_tools(HoverTool(tooltips=[("Nota", "$y")]))
    
    #temp = Panel(child=p, title='Ddisciplina '+str(atual))
    
    #tbs.append(temp)
    ps.append(p)
    
# Make a box plot: p


# Set the y axis label

grid = gridplot([ps[0:2], ps[2:4], ps[4:6], [None, ps[6]]], sizing_mode='scale_width')
#grid = gridplot([[ps[0], ps[1]], [ps[2], ps[3]], [ps[4], ps[5]], [None, ps[6]]], sizing_mode='scale_width')

#tabs = Tabs(tabs=[ tbs[0], tbs[5] ],sizing_mode='scale_width' )

#show(tabs)
# show the results
show(grid)



In [16]:
##Quantidade de alunos ativos/Trancados/cancelados/...
count_series = pd.DataFrame(columns=('Status', 'Count'))
count_series["Status"] = df["status"].unique().tolist()
tam = len(df)

for atual in range(0,len(count_series)):
    count_series.iloc[atual]["Count"] = df[df["status"] == count_series.iloc[atual]["Status"]]["status"].count()

def porcent(val):
    p = (val*100)/tam
    return p
    
graph = Donut(count_series, label="Status", values="Count", title="Situação dos alunos" )
graph.add_tools(HoverTool(tooltips=[("Alunos", str(count_series.iloc[1]["Count"])+"/"+str(tam)), #falta trocar o 1 pelo número de alunos da categoria
                                    ("Porcentagem", str(porcent(1000))) #falta trocar o 1000 pelo numero de alunos da categoria
                                   ]
                         ))

show(graph)



In [17]:
###Verificar desistência
def popula(string):
    aux = []
    for atual in range (2014, 2017):
        aux.append(df[df["ano_ingresso"] == atual][df["status"] == string]["status"].count())
    return aux

count_desist = pd.DataFrame(columns=('ANO', 'CANCELADO', 'ATIVO', 'TRANCADO', 'CONCLUIDO', 'FORMANDO', 'FORMADO'))
count_desist["ANO"] = [2014, 2015, 2016]

for atual in count_desist.columns.values:
    if atual != "ANO":
        count_desist[atual] = popula(atual)

print(count_desist)


    ANO  CANCELADO  ATIVO  TRANCADO  CONCLUIDO  FORMANDO  FORMADO
0  2014        584    843       112         13        38       10
1  2015        329   1530       114          7         0        0
2  2016        134   1067        56          0         5        0
/home/danielmarx/anaconda3/lib/python3.6/site-packages/ipykernel/__main__.py:5: UserWarning: Boolean Series key will be reindexed to match DataFrame index.

In [18]:
BLEND = blend('CANCELADO', "ATIVO",  "TRANCADO",  "CONCLUIDO",  "FORMANDO",  "FORMADO",
              name='unidade', labels_name='Situacao')

bar = Bar(count_desist, values=BLEND,
          label=cat(columns='ANO', sort=False),
          stack=cat(columns='Situacao', sort=False),
          color="Situacao",
          legend='bottom_center',
          title="Índice de formação e desitencia de alunos por ano",
          ylabel = "Alunos",
          tooltips=[('Tipo', "$y")]) ##Corrigir isso também

output_notebook()

show(bar)


Loading BokehJS ...

In [19]:
###Verificar NOtas ENEM
ps = []
temp = ["nota", "enen-nota", "enem-matematica", "enem-redacao"]

for atual in range(0, len(temp)):
    p = BoxPlot(disciplinas_categorias, values=temp[atual], label='disciplina_ID',
                        color='disciplina_ID', title=str(temp[atual])+" x Desempenho em disciplinas ", legend='bottom_right')
    p.yaxis.axis_label='Média por período'
    p.add_tools(HoverTool(tooltips=[("Nota", "$y{f.fff}")]))
    
    #temp = Panel(child=p, title='Ddisciplina '+str(atual))
    
    #tbs.append(temp)
    ps.append(p)
    

grid = gridplot([ps[0:2], ps[2:4]], sizing_mode='scale_width')

output_notebook()
show(grid)


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
/home/danielmarx/anaconda3/lib/python3.6/site-packages/pandas/indexes/base.py in get_loc(self, key, method, tolerance)
   2133             try:
-> 2134                 return self._engine.get_loc(key)
   2135             except KeyError:

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4433)()

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4279)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13742)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13696)()

KeyError: 'enen-nota'

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-19-d2e189cde1c5> in <module>()
      5 for atual in range(0, len(temp)):
      6     p = BoxPlot(disciplinas_categorias, values=temp[atual], label='disciplina_ID',
----> 7                         color='disciplina_ID', title=str(temp[atual])+" x Desempenho em disciplinas ", legend='bottom_right')
      8     p.yaxis.axis_label='Média por período'
      9     p.add_tools(HoverTool(tooltips=[("Nota", "$y{f.fff}")]))

/home/danielmarx/anaconda3/lib/python3.6/site-packages/bokeh/charts/builders/boxplot_builder.py in BoxPlot(data, label, values, color, group, xscale, yscale, xgrid, ygrid, continuous_range, **kw)
    108     kw['y_range'] = y_range
    109 
--> 110     return create_and_build(BoxPlotBuilder, data, **kw)
    111 
    112 

/home/danielmarx/anaconda3/lib/python3.6/site-packages/bokeh/charts/builder.py in create_and_build(builder_class, *data, **kws)
     55     chart_kws = {k: v for k, v in kws.items() if k not in builder_props}
     56     chart = Chart(**chart_kws)
---> 57     chart.add_builder(builder)
     58     chart.start_plot()
     59 

/home/danielmarx/anaconda3/lib/python3.6/site-packages/bokeh/charts/chart.py in add_builder(self, builder)
    151     def add_builder(self, builder):
    152         self._builders.append(builder)
--> 153         builder.create(self)
    154 
    155     def add_ranges(self, dim, range):

/home/danielmarx/anaconda3/lib/python3.6/site-packages/bokeh/charts/builder.py in create(self, chart)
    519         if chart is None:
    520             chart = Chart()
--> 521         chart.add_renderers(self, renderers)
    522 
    523         # handle ranges after renders, since ranges depend on aggregations

/home/danielmarx/anaconda3/lib/python3.6/site-packages/bokeh/charts/chart.py in add_renderers(self, builder, renderers)
    146 
    147     def add_renderers(self, builder, renderers):
--> 148         self.renderers += renderers
    149         self._renderer_map.extend({ r._id : builder for r in renderers })
    150 

/home/danielmarx/anaconda3/lib/python3.6/site-packages/bokeh/core/property/containers.py in wrapper(self, *args, **kwargs)
     74     def wrapper(self, *args, **kwargs):
     75         old = self._saved_copy()
---> 76         result = func(self, *args, **kwargs)
     77         self._notify_owners(old)
     78         return result

/home/danielmarx/anaconda3/lib/python3.6/site-packages/bokeh/core/property/containers.py in __iadd__(self, y)
    173     @notify_owner
    174     def __iadd__(self, y):
--> 175         return super(PropertyValueList, self).__iadd__(y)
    176 
    177     # x *= y

/home/danielmarx/anaconda3/lib/python3.6/site-packages/bokeh/charts/builders/bar_builder.py in yield_renderers(self)
    212             bg = self.glyph(label=group.label,
    213                             x_label=self._get_label(group['label']),
--> 214                             values=group.data[self.values.selection].values,
    215                             agg=stats[self.agg](),
    216                             width=self.bar_width,

/home/danielmarx/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in __getitem__(self, key)
   2057             return self._getitem_multilevel(key)
   2058         else:
-> 2059             return self._getitem_column(key)
   2060 
   2061     def _getitem_column(self, key):

/home/danielmarx/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in _getitem_column(self, key)
   2064         # get column
   2065         if self.columns.is_unique:
-> 2066             return self._get_item_cache(key)
   2067 
   2068         # duplicate columns & possible reduce dimensionality

/home/danielmarx/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py in _get_item_cache(self, item)
   1384         res = cache.get(item)
   1385         if res is None:
-> 1386             values = self._data.get(item)
   1387             res = self._box_item_values(item, values)
   1388             cache[item] = res

/home/danielmarx/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in get(self, item, fastpath)
   3541 
   3542             if not isnull(item):
-> 3543                 loc = self.items.get_loc(item)
   3544             else:
   3545                 indexer = np.arange(len(self.items))[isnull(self.items)]

/home/danielmarx/anaconda3/lib/python3.6/site-packages/pandas/indexes/base.py in get_loc(self, key, method, tolerance)
   2134                 return self._engine.get_loc(key)
   2135             except KeyError:
-> 2136                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   2137 
   2138         indexer = self.get_indexer([key], method=method, tolerance=tolerance)

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4433)()

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4279)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13742)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13696)()

KeyError: 'enen-nota'

In [20]:
#Verificar distancias
distancias = df[df["status"] == "ATIVO"].copy()
distancias["LAT"], distancias["LON"], distancias["KM"] = [0,0,0]
distancias = distancias.reset_index()
uf = gc.google("59064741").latlng

for i in tqdm(range(0,100)): ## trocar o range por range(len(distancias))
    st = distancias.loc[i,'CEP']
    g = gc.google(st)
    if g.lat == None:
        distancias.loc[i, "LAT"] = 0
    elif g.lng == None:
        distancias.loc[i, "LON"] = 0
    else:
        distancias.loc[i, "LON"] = g.lng
        distancias.loc[i, "LAT"] = g.lat
print("Completo")

distancias.to_csv('LatLong_Alunos.csv', encoding="utf-8")
##Salva tabela criada
    
#UFRN 59064-741
distancias.head()


  5%|▌         | 5/100 [00:05<01:41,  1.06s/it]
---------------------------------------------------------------------------
WantReadError                             Traceback (most recent call last)
/home/danielmarx/anaconda3/lib/python3.6/site-packages/requests/packages/urllib3/contrib/pyopenssl.py in wrap_socket(self, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname)
    416             try:
--> 417                 cnx.do_handshake()
    418             except OpenSSL.SSL.WantReadError:

/home/danielmarx/anaconda3/lib/python3.6/site-packages/OpenSSL/SSL.py in do_handshake(self)
   1425         result = _lib.SSL_do_handshake(self._ssl)
-> 1426         self._raise_ssl_error(self._ssl, result)
   1427 

/home/danielmarx/anaconda3/lib/python3.6/site-packages/OpenSSL/SSL.py in _raise_ssl_error(self, ssl, result)
   1148         if error == _lib.SSL_ERROR_WANT_READ:
-> 1149             raise WantReadError()
   1150         elif error == _lib.SSL_ERROR_WANT_WRITE:

WantReadError: 

During handling of the above exception, another exception occurred:

KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-20-6b1aff7a023d> in <module>()
      7 for i in tqdm(range(0,100)): ## trocar o range por range(len(distancias))
      8     st = distancias.loc[i,'CEP']
----> 9     g = gc.google(st)
     10     if g.lat == None:
     11         distancias.loc[i, "LAT"] = 0

/home/danielmarx/anaconda3/lib/python3.6/site-packages/geocoder/api.py in google(location, **kwargs)
    184         > elevation
    185     """
--> 186     return get(location, provider='google', **kwargs)
    187 
    188 

/home/danielmarx/anaconda3/lib/python3.6/site-packages/geocoder/api.py in get(location, **kwargs)
    150         if method not in options[provider]:
    151             raise ValueError("Invalid method")
--> 152     return options[provider][method](location, **kwargs)
    153 
    154 

/home/danielmarx/anaconda3/lib/python3.6/site-packages/geocoder/google.py in __init__(self, location, **kwargs)
     52         elif kwargs.get('key', google_key):
     53             self.params['key'] = kwargs.get('key', google_key)
---> 54         self._initialize(**kwargs)
     55 
     56     def _location_init(self, location, **kwargs):

/home/danielmarx/anaconda3/lib/python3.6/site-packages/geocoder/base.py in _initialize(self, **kwargs)
    124         self.encoding = kwargs.get('encoding', 'utf-8')
    125         self.session = kwargs.get('session', requests.Session())
--> 126         self._connect(url=self.url, **kwargs)
    127         ###
    128         try:

/home/danielmarx/anaconda3/lib/python3.6/site-packages/geocoder/base.py in _connect(self, **kwargs)
     94                 headers=self.headers,
     95                 timeout=self.timeout,
---> 96                 proxies=self.proxies
     97             )
     98             self.status_code = r.status_code

/home/danielmarx/anaconda3/lib/python3.6/site-packages/geocoder/base.py in rate_limited_get(self, url, **kwargs)
     71 
     72     def rate_limited_get(self, url, **kwargs):
---> 73         return self.session.get(url, **kwargs)
     74 
     75     @staticmethod

/home/danielmarx/anaconda3/lib/python3.6/site-packages/requests/sessions.py in get(self, url, **kwargs)
    499 
    500         kwargs.setdefault('allow_redirects', True)
--> 501         return self.request('GET', url, **kwargs)
    502 
    503     def options(self, url, **kwargs):

/home/danielmarx/anaconda3/lib/python3.6/site-packages/requests/sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
    486         }
    487         send_kwargs.update(settings)
--> 488         resp = self.send(prep, **send_kwargs)
    489 
    490         return resp

/home/danielmarx/anaconda3/lib/python3.6/site-packages/requests/sessions.py in send(self, request, **kwargs)
    607 
    608         # Send the request
--> 609         r = adapter.send(request, **kwargs)
    610 
    611         # Total elapsed time of the request (approximately)

/home/danielmarx/anaconda3/lib/python3.6/site-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
    421                     decode_content=False,
    422                     retries=self.max_retries,
--> 423                     timeout=timeout
    424                 )
    425 

/home/danielmarx/anaconda3/lib/python3.6/site-packages/requests/packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, **response_kw)
    592                                                   timeout=timeout_obj,
    593                                                   body=body, headers=headers,
--> 594                                                   chunked=chunked)
    595 
    596             # If we're going to release the connection in ``finally:``, then

/home/danielmarx/anaconda3/lib/python3.6/site-packages/requests/packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
    348         # Trigger any extra validation we need to do.
    349         try:
--> 350             self._validate_conn(conn)
    351         except (SocketTimeout, BaseSSLError) as e:
    352             # Py2 raises this as a BaseSSLError, Py3 raises it as socket timeout.

/home/danielmarx/anaconda3/lib/python3.6/site-packages/requests/packages/urllib3/connectionpool.py in _validate_conn(self, conn)
    833         # Force connect early to allow us to validate the connection.
    834         if not getattr(conn, 'sock', None):  # AppEngine might not have  `.sock`
--> 835             conn.connect()
    836 
    837         if not conn.is_verified:

/home/danielmarx/anaconda3/lib/python3.6/site-packages/requests/packages/urllib3/connection.py in connect(self)
    321             ca_cert_dir=self.ca_cert_dir,
    322             server_hostname=hostname,
--> 323             ssl_context=context)
    324 
    325         if self.assert_fingerprint:

/home/danielmarx/anaconda3/lib/python3.6/site-packages/requests/packages/urllib3/util/ssl_.py in ssl_wrap_socket(sock, keyfile, certfile, cert_reqs, ca_certs, server_hostname, ssl_version, ciphers, ssl_context, ca_cert_dir)
    322         context.load_cert_chain(certfile, keyfile)
    323     if HAS_SNI:  # Platform-specific: OpenSSL with enabled SNI
--> 324         return context.wrap_socket(sock, server_hostname=server_hostname)
    325 
    326     warnings.warn(

/home/danielmarx/anaconda3/lib/python3.6/site-packages/requests/packages/urllib3/contrib/pyopenssl.py in wrap_socket(self, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname)
    417                 cnx.do_handshake()
    418             except OpenSSL.SSL.WantReadError:
--> 419                 rd, _, _ = select.select([sock], [], [], sock.gettimeout())
    420                 if not rd:
    421                     raise timeout('select timed out')

KeyboardInterrupt: 

In [59]:
distancias =  pd.read_csv('LatLong_Alunos.csv', encoding="utf-8", index_col=0)
    
for atual in tqdm(range(0,100)): ## trocar o range por range(len(distancias))
    lt = distancias.loc[atual, "LAT"]
    ln = distancias.loc[atual, "LON"]
    if lt != 0. and ln != 0.:
        compare = (lt, ln)
        #print(vincenty(uf, compare).km)
        distancias.loc[atual, "KM"] = vincenty(uf, compare).km
    
distancias.head()


100%|██████████████████████████████████████| 100/100 [00:00<00:00, 1080.22it/s]
Out[59]:
index a_ID CEP ano_ingresso periodo_ingresso status ano_disciplina periodo_disciplina nota disciplina_ID status.disciplina LAT LON KM
0 3 2 59072580 2014 1 ATIVO 2014 2 6.1 0 Aprovado 0 0 0
1 4 3 59088150 2014 1 ATIVO 2014 1 3.0 0 Reprovado 0 0 0
2 5 3 59088150 2014 1 ATIVO 2014 2 7.2 0 Aprovado 0 0 0
3 8 5 59112430 2014 1 ATIVO 2014 2 8.9 0 Aprovado 0 0 0
4 12 7 59020330 2014 1 ATIVO 2014 2 5.6 0 Aprovado 0 0 0

In [60]:
def binSearch(array, ano, periodo, element_to_search):
    for index in range(len(array)):
        if((periodo == array[index]['PERIODO'] and ano == array[index]['ANO']) and element_to_search == array[index]['DISCIPLINA']):
            return index
    return -1

In [61]:
import copy

def catchTRA(array, tra):
    dict_model = {'ANO': '', 'PERIODO': '', 'DISCIPLINA': 0, 'NUMERO_DE_ALUNOS': 0, 'PORCENTAGEM': 0}
    anos=  [2014, 2015, 2016]
    periodos = [1, 2]
    for ano in anos:
        for periodo in periodos:
            temp_df1 = df[df['status.disciplina'] == tra]
            temp_df2 = temp_df1[temp_df1['periodo_disciplina'] == periodo]
            temp_df3 = temp_df2[temp_df2['ano_disciplina'] == ano]
            #print(temp_df2)
            for index in range(len(temp_df3)):
                disciplina = temp_df3.iloc[index]['disciplina_ID']
            
                bin_result = binSearch(array, ano, periodo, disciplina)
                if bin_result == -1:
                    temp_dict = copy.copy(dict_model)
                    temp_dict['ANO'] = ano
                    temp_dict['PERIODO'] = periodo
                    temp_dict['DISCIPLINA'] = disciplina
                    temp_dict['NUMERO_DE_ALUNOS'] = 1
                    array.append(temp_dict)
                else:
                    array[bin_result]['NUMERO_DE_ALUNOS'] += 1
    
    for index in range(len(array)):
        index_total_estudantes = df[df['ano_disciplina'] == array[index]['ANO']]
        index_total_estudantes = index_total_estudantes[index_total_estudantes['periodo_disciplina'] == array[index]['PERIODO']]
        index_total_estudantes = index_total_estudantes[index_total_estudantes['disciplina_ID'] == array[index]['DISCIPLINA']]
        index_total_estudantes = len(index_total_estudantes)
    
        array[index]['PORCENTAGEM'] = (array[index]['NUMERO_DE_ALUNOS']/index_total_estudantes) * 100

In [62]:
reprovados = []

catchTRA(reprovados, 'Reprovado')

In [63]:
aprovados = []

catchTRA(aprovados, 'Aprovado')

Graficos de aprovados e reprovados


In [64]:
import numpy as np
import scipy.special

from bokeh.layouts import gridplot
from bokeh.plotting import figure, show, output_file

def cria_graficos_barras_apro_repro(array, disciplina, titulo_grafico):
    dados = []
    
    anos=  [2014, 2015, 2016]
    periodos = [1, 2]
    
    for ano in anos:
        for periodo in periodos:
            for index in range(len(array)):
                if (array[index]['ANO'] == ano and array[index]['PERIODO'] == periodo) and array[index]['DISCIPLINA'] == disciplina:  
                    dados.append(array[index]['PORCENTAGEM'])   

    data = {
    'semestres': ['2014.1', '2014.2', '2015.1', '2015.2', '2016.1', '2016.2'],
    'porcentage': dados
}

    # table-like data results in reconfiguration of the chart with no data manipulation
    bar2 = Bar(data, values='porcentage', label=['semestres'], title=titulo_grafico, plot_width=400)

    output_file("stacked_bar.html")
    show(row(bar2))

In [65]:
cria_graficos_barras_apro_repro(aprovados, 0, 'Aprovados disciplina 0')
cria_graficos_barras_apro_repro(aprovados, 1, 'Aprovados disciplina 1')
cria_graficos_barras_apro_repro(aprovados, 2, 'Aprovados disciplina 2')
cria_graficos_barras_apro_repro(aprovados, 3, 'Aprovados disciplina 3')
cria_graficos_barras_apro_repro(aprovados, 5, 'Aprovados disciplina 5')
cria_graficos_barras_apro_repro(aprovados, 6, 'Aprovados disciplina 6')


INFO:bokeh.core.state:Session output file 'stacked_bar.html' already exists, will be overwritten.
INFO:bokeh.core.state:Session output file 'stacked_bar.html' already exists, will be overwritten.
INFO:bokeh.core.state:Session output file 'stacked_bar.html' already exists, will be overwritten.
INFO:bokeh.core.state:Session output file 'stacked_bar.html' already exists, will be overwritten.
INFO:bokeh.core.state:Session output file 'stacked_bar.html' already exists, will be overwritten.

In [66]:
cria_graficos_barras_apro_repro(reprovados, 0, 'Reprovados disciplina 0')
cria_graficos_barras_apro_repro(reprovados, 1, 'Reprovados disciplina 1')
cria_graficos_barras_apro_repro(reprovados, 2, 'Reprovados disciplina 2')
cria_graficos_barras_apro_repro(reprovados, 3, 'Reprovados disciplina 3')
cria_graficos_barras_apro_repro(reprovados, 5, 'Reprovados disciplina 5')
cria_graficos_barras_apro_repro(reprovados, 6, 'Reprovados disciplina 6')


INFO:bokeh.core.state:Session output file 'stacked_bar.html' already exists, will be overwritten.
INFO:bokeh.core.state:Session output file 'stacked_bar.html' already exists, will be overwritten.
INFO:bokeh.core.state:Session output file 'stacked_bar.html' already exists, will be overwritten.
INFO:bokeh.core.state:Session output file 'stacked_bar.html' already exists, will be overwritten.
INFO:bokeh.core.state:Session output file 'stacked_bar.html' already exists, will be overwritten.
INFO:bokeh.core.state:Session output file 'stacked_bar.html' already exists, will be overwritten.

Preparação para determinar Correlação entre média de notas e distância até a UF


In [21]:
# Import pandas
import pandas as pd
import geocoder as gc
from tqdm import tqdm
from geopy.geocoders import Nominatim
from geopy.distance import vincenty

# Import BoxPlot, output_notebook, and show from bokeh.charts
from bokeh.charts import BoxPlot, Donut, Bar, Histogram, output_notebook, show
from bokeh.charts.attributes import cat, color
from bokeh.charts.operations import blend
from bokeh.layouts import gridplot, row
from bokeh.models import HoverTool
from bokeh.models.widgets import Panel, Tabs
from bokeh.plotting import ColumnDataSource

In [22]:
#Verificar distancias
distancias = df[df["status"] == "ATIVO"].copy()
distancias["LAT"], distancias["LON"], distancias["KM"] = [0,0,0]
distancias = distancias.reset_index()
uf = gc.google("59064741").latlng

for i in tqdm(range(len(distancias))): ## trocar o range por range(len(distancias))
    st = distancias.loc[i,'CEP']
    g = gc.google(st)
    if g.lat == None:
        distancias.loc[i, "LAT"] = 0
    elif g.lng == None:
        distancias.loc[i, "LON"] = 0
    else:
        distancias.loc[i, "LON"] = g.lng
        distancias.loc[i, "LAT"] = g.lat
print("Completo")

distancias.to_csv('LatLong_Alunos3.csv', encoding="utf-8")
##Salva tabela criada
    
#UFRN 59064-741
distancias.head()


  0%|          | 5/3440 [00:03<43:16,  1.32it/s]
---------------------------------------------------------------------------
WantReadError                             Traceback (most recent call last)
/home/danielmarx/anaconda3/lib/python3.6/site-packages/requests/packages/urllib3/contrib/pyopenssl.py in wrap_socket(self, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname)
    416             try:
--> 417                 cnx.do_handshake()
    418             except OpenSSL.SSL.WantReadError:

/home/danielmarx/anaconda3/lib/python3.6/site-packages/OpenSSL/SSL.py in do_handshake(self)
   1425         result = _lib.SSL_do_handshake(self._ssl)
-> 1426         self._raise_ssl_error(self._ssl, result)
   1427 

/home/danielmarx/anaconda3/lib/python3.6/site-packages/OpenSSL/SSL.py in _raise_ssl_error(self, ssl, result)
   1148         if error == _lib.SSL_ERROR_WANT_READ:
-> 1149             raise WantReadError()
   1150         elif error == _lib.SSL_ERROR_WANT_WRITE:

WantReadError: 

During handling of the above exception, another exception occurred:

KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-22-6438bacda0dc> in <module>()
      7 for i in tqdm(range(len(distancias))): ## trocar o range por range(len(distancias))
      8     st = distancias.loc[i,'CEP']
----> 9     g = gc.google(st)
     10     if g.lat == None:
     11         distancias.loc[i, "LAT"] = 0

/home/danielmarx/anaconda3/lib/python3.6/site-packages/geocoder/api.py in google(location, **kwargs)
    184         > elevation
    185     """
--> 186     return get(location, provider='google', **kwargs)
    187 
    188 

/home/danielmarx/anaconda3/lib/python3.6/site-packages/geocoder/api.py in get(location, **kwargs)
    150         if method not in options[provider]:
    151             raise ValueError("Invalid method")
--> 152     return options[provider][method](location, **kwargs)
    153 
    154 

/home/danielmarx/anaconda3/lib/python3.6/site-packages/geocoder/google.py in __init__(self, location, **kwargs)
     52         elif kwargs.get('key', google_key):
     53             self.params['key'] = kwargs.get('key', google_key)
---> 54         self._initialize(**kwargs)
     55 
     56     def _location_init(self, location, **kwargs):

/home/danielmarx/anaconda3/lib/python3.6/site-packages/geocoder/base.py in _initialize(self, **kwargs)
    124         self.encoding = kwargs.get('encoding', 'utf-8')
    125         self.session = kwargs.get('session', requests.Session())
--> 126         self._connect(url=self.url, **kwargs)
    127         ###
    128         try:

/home/danielmarx/anaconda3/lib/python3.6/site-packages/geocoder/base.py in _connect(self, **kwargs)
     94                 headers=self.headers,
     95                 timeout=self.timeout,
---> 96                 proxies=self.proxies
     97             )
     98             self.status_code = r.status_code

/home/danielmarx/anaconda3/lib/python3.6/site-packages/geocoder/base.py in rate_limited_get(self, url, **kwargs)
     71 
     72     def rate_limited_get(self, url, **kwargs):
---> 73         return self.session.get(url, **kwargs)
     74 
     75     @staticmethod

/home/danielmarx/anaconda3/lib/python3.6/site-packages/requests/sessions.py in get(self, url, **kwargs)
    499 
    500         kwargs.setdefault('allow_redirects', True)
--> 501         return self.request('GET', url, **kwargs)
    502 
    503     def options(self, url, **kwargs):

/home/danielmarx/anaconda3/lib/python3.6/site-packages/requests/sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
    486         }
    487         send_kwargs.update(settings)
--> 488         resp = self.send(prep, **send_kwargs)
    489 
    490         return resp

/home/danielmarx/anaconda3/lib/python3.6/site-packages/requests/sessions.py in send(self, request, **kwargs)
    607 
    608         # Send the request
--> 609         r = adapter.send(request, **kwargs)
    610 
    611         # Total elapsed time of the request (approximately)

/home/danielmarx/anaconda3/lib/python3.6/site-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
    421                     decode_content=False,
    422                     retries=self.max_retries,
--> 423                     timeout=timeout
    424                 )
    425 

/home/danielmarx/anaconda3/lib/python3.6/site-packages/requests/packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, **response_kw)
    592                                                   timeout=timeout_obj,
    593                                                   body=body, headers=headers,
--> 594                                                   chunked=chunked)
    595 
    596             # If we're going to release the connection in ``finally:``, then

/home/danielmarx/anaconda3/lib/python3.6/site-packages/requests/packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
    348         # Trigger any extra validation we need to do.
    349         try:
--> 350             self._validate_conn(conn)
    351         except (SocketTimeout, BaseSSLError) as e:
    352             # Py2 raises this as a BaseSSLError, Py3 raises it as socket timeout.

/home/danielmarx/anaconda3/lib/python3.6/site-packages/requests/packages/urllib3/connectionpool.py in _validate_conn(self, conn)
    833         # Force connect early to allow us to validate the connection.
    834         if not getattr(conn, 'sock', None):  # AppEngine might not have  `.sock`
--> 835             conn.connect()
    836 
    837         if not conn.is_verified:

/home/danielmarx/anaconda3/lib/python3.6/site-packages/requests/packages/urllib3/connection.py in connect(self)
    321             ca_cert_dir=self.ca_cert_dir,
    322             server_hostname=hostname,
--> 323             ssl_context=context)
    324 
    325         if self.assert_fingerprint:

/home/danielmarx/anaconda3/lib/python3.6/site-packages/requests/packages/urllib3/util/ssl_.py in ssl_wrap_socket(sock, keyfile, certfile, cert_reqs, ca_certs, server_hostname, ssl_version, ciphers, ssl_context, ca_cert_dir)
    322         context.load_cert_chain(certfile, keyfile)
    323     if HAS_SNI:  # Platform-specific: OpenSSL with enabled SNI
--> 324         return context.wrap_socket(sock, server_hostname=server_hostname)
    325 
    326     warnings.warn(

/home/danielmarx/anaconda3/lib/python3.6/site-packages/requests/packages/urllib3/contrib/pyopenssl.py in wrap_socket(self, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname)
    417                 cnx.do_handshake()
    418             except OpenSSL.SSL.WantReadError:
--> 419                 rd, _, _ = select.select([sock], [], [], sock.gettimeout())
    420                 if not rd:
    421                     raise timeout('select timed out')

KeyboardInterrupt: 

In [38]:
distancias.reset_index()


Out[38]:
level_0 index a_ID CEP ano_ingresso periodo_ingresso status ano_disciplina periodo_disciplina nota disciplina_ID status.disciplina LAT LON KM
0 0 3 2 59072580 2014 1 ATIVO 2014 2 6.1 0 Aprovado -5.832998 -35.242542 3.451075
1 1 4 3 59088150 2014 1 ATIVO 2014 1 3.0 0 Reprovado -5.872282 -35.206600 4.027258
2 2 5 3 59088150 2014 1 ATIVO 2014 2 7.2 0 Aprovado -5.872282 -35.206600 4.027258
3 3 8 5 59112430 2014 1 ATIVO 2014 2 8.9 0 Aprovado -5.750546 -35.258952 10.830276
4 4 12 7 59020330 2014 1 ATIVO 2014 2 5.6 0 Aprovado -5.798325 -35.202731 4.301033
5 5 13 8 59151610 2014 1 ATIVO 2014 2 1.5 0 Reprovado 0.000000 0.000000 0.000000
6 6 14 8 59151610 2014 1 ATIVO 2015 1 6.9 0 Aprovado 0.000000 0.000000 0.000000
7 7 21 12 59547000 2014 1 ATIVO 2014 2 2.6 0 Reprovado -5.516642 -36.067595 101.201317
8 8 22 12 59547000 2014 1 ATIVO 2015 1 5.9 0 Aprovado -5.516642 -36.067595 101.201317
9 9 25 14 59150500 2014 1 ATIVO 2014 2 1.2 0 Reprovado 0.000000 0.000000 0.000000
10 10 26 14 59150500 2014 1 ATIVO 2015 1 4.3 0 Reprovado 0.000000 0.000000 0.000000
11 11 27 14 59150500 2014 1 ATIVO 2015 2 7.7 0 Aprovado 0.000000 0.000000 0.000000
12 12 28 15 59151800 2014 1 ATIVO 2014 2 8.5 0 Aprovado -5.890733 -35.196979 6.242204
13 13 29 16 59810000 2014 1 ATIVO 2014 2 9.8 0 Aprovado -6.028499 -37.999640 309.449313
14 14 36 20 59090620 2014 1 ATIVO 2014 2 6.4 0 Aprovado -5.885479 -35.168988 7.204428
15 15 37 21 59324000 2014 1 ATIVO 2014 2 7.5 0 Aprovado -6.330924 -37.275809 234.964516
16 16 39 23 59070750 2014 2 ATIVO 2015 1 8.9 0 Aprovado -5.833169 -35.231361 2.220069
17 17 41 25 59115685 2014 1 ATIVO 2016 1 7.3 0 Aprovado -5.744671 -35.271933 12.132286
18 18 42 26 59290000 2014 1 ATIVO 2014 2 1.1 0 Reprovado -5.776116 -35.368568 18.616898
19 19 43 26 59290000 2014 1 ATIVO 2015 1 0.3 0 Reprovado -5.776116 -35.368568 18.616898
20 20 44 26 59290000 2014 1 ATIVO 2015 2 6.6 0 Aprovado -5.776116 -35.368568 18.616898
21 21 45 27 59040240 2014 1 ATIVO 2014 1 5.9 0 Aprovado -5.793517 -35.223136 4.891914
22 22 49 30 59094300 2014 1 ATIVO 2014 2 6.8 0 Aprovado -5.871294 -35.184497 4.902081
23 23 50 31 59020400 2014 1 ATIVO 2014 2 8.4 0 Aprovado -5.787405 -35.201232 5.515917
24 24 52 33 59122495 2014 1 ATIVO 2014 2 7.5 0 Aprovado -5.750008 -35.225788 9.661615
25 25 53 34 51011480 2014 1 ATIVO 2014 2 0.2 0 Reprovado -8.098336 -34.886179 252.740994
26 26 54 34 51011480 2014 1 ATIVO 2015 1 5.3 0 Aprovado -8.098336 -34.886179 252.740994
27 27 58 37 59037250 2014 1 ATIVO 2014 2 0.3 0 Reprovado -5.804820 -35.231433 4.110470
28 28 59 37 59037250 2014 1 ATIVO 2015 2 0.7 0 Reprovado -5.804820 -35.231433 4.110470
29 29 60 37 59037250 2014 1 ATIVO 2016 1 0.0 0 Reprovado -5.804820 -35.231433 4.110470
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3213 3407 4804 632 59628550 2016 1 ATIVO 2016 2 5.8 6 Aprovado 0.000000 0.000000 0.000000
3214 3408 4805 633 59022530 2016 1 ATIVO 2016 2 8.5 6 Aprovado 0.000000 0.000000 0.000000
3215 3411 4808 635 59700000 2016 1 ATIVO 2016 2 6.5 6 Aprovado 0.000000 0.000000 0.000000
3216 3412 4809 636 59060215 2016 1 ATIVO 2016 2 0.0 6 Reprovado 0.000000 0.000000 0.000000
3217 3414 4811 638 59062000 2016 1 ATIVO 2016 2 7.8 6 Aprovado 0.000000 0.000000 0.000000
3218 3415 4812 639 59158150 2016 1 ATIVO 2016 2 5.9 6 Aprovado 0.000000 0.000000 0.000000
3219 3416 4815 644 59080115 2016 1 ATIVO 2016 1 7.1 6 Aprovado 0.000000 0.000000 0.000000
3220 3417 4816 645 59360000 2016 1 ATIVO 2016 2 8.5 6 Aprovado 0.000000 0.000000 0.000000
3221 3418 4817 647 59290000 2016 1 ATIVO 2016 2 5.1 6 Aprovado 0.000000 0.000000 0.000000
3222 3419 4818 648 5021000 2016 1 ATIVO 2016 2 1.4 6 Reprovado 0.000000 0.000000 0.000000
3223 3420 4819 649 59088460 2016 1 ATIVO 2016 1 5.5 6 Aprovado 0.000000 0.000000 0.000000
3224 3421 4820 650 59149403 2016 1 ATIVO 2016 2 7.9 6 Aprovado 0.000000 0.000000 0.000000
3225 3422 4822 652 35700416 2016 1 ATIVO 2016 2 7.8 6 Aprovado 0.000000 0.000000 0.000000
3226 3423 4823 653 59054375 2016 1 ATIVO 2016 2 1.4 6 Reprovado 0.000000 0.000000 0.000000
3227 3424 4824 655 59150600 2016 1 ATIVO 2016 2 7.0 6 Aprovado 0.000000 0.000000 0.000000
3228 3425 4825 656 59730000 2016 1 ATIVO 2016 2 6.7 6 Aprovado 0.000000 0.000000 0.000000
3229 3426 4826 657 59020030 2016 1 ATIVO 2016 2 1.0 6 Reprovado 0.000000 0.000000 0.000000
3230 3427 4828 660 59080115 2016 1 ATIVO 2016 2 7.3 6 Aprovado 0.000000 0.000000 0.000000
3231 3428 4829 661 59152600 2016 1 ATIVO 2016 2 0.0 6 Reprovado 0.000000 0.000000 0.000000
3232 3429 4830 662 59136160 2016 1 ATIVO 2016 2 6.8 6 Aprovado 0.000000 0.000000 0.000000
3233 3430 4831 663 59074420 2016 1 ATIVO 2016 2 2.3 6 Reprovado 0.000000 0.000000 0.000000
3234 3431 4832 664 59149150 2016 1 ATIVO 2016 2 7.4 6 Aprovado 0.000000 0.000000 0.000000
3235 3432 4833 665 59133305 2016 1 ATIVO 2016 2 2.8 6 Reprovado 0.000000 0.000000 0.000000
3236 3433 4835 667 59080100 2016 1 ATIVO 2016 2 0.3 6 Reprovado 0.000000 0.000000 0.000000
3237 3434 4836 668 59071080 2016 1 ATIVO 2016 2 5.2 6 Aprovado 0.000000 0.000000 0.000000
3238 3435 4837 669 59151550 2016 1 ATIVO 2016 2 1.8 6 Reprovado 0.000000 0.000000 0.000000
3239 3436 4838 670 18150000 2016 1 ATIVO 2016 2 6.2 6 Aprovado 0.000000 0.000000 0.000000
3240 3437 4839 856 59076700 2016 1 ATIVO 2016 2 9.6 6 Aprovado 0.000000 0.000000 0.000000
3241 3438 4840 672 59031120 2016 1 ATIVO 2016 2 1.7 6 Reprovado 0.000000 0.000000 0.000000
3242 3439 4841 870 59280000 2016 1 ATIVO 2016 2 0.0 6 Reprovado 0.000000 0.000000 0.000000

3243 rows × 15 columns


In [71]:
distancias =  pd.read_csv('LatLong_Alunos3.csv', encoding="utf-8", index_col=0)

for atual in tqdm(range(len(distancias))): ## trocar o range por range(len(distancias))
    lt = distancias.loc[atual, "LAT"]
    ln = distancias.loc[atual, "LON"]
    if lt != 0. and ln != 0.:
        compare = (lt, ln)
        #print(vincenty(uf, compare).km)
        distancias.loc[atual, "KM"] = vincenty(uf, compare).km
        
distancias.to_csv('LatLong_Alunos.csv', encoding="utf-8")    
distancias.head()


100%|██████████| 3440/3440 [00:04<00:00, 775.48it/s] 
Out[71]:
index a_ID CEP ano_ingresso periodo_ingresso status ano_disciplina periodo_disciplina nota disciplina_ID status.disciplina LAT LON KM
0 3 2 59072580 2014 1 ATIVO 2014 2 6.1 0 Aprovado -5.832998 -35.242542 3.451075
1 4 3 59088150 2014 1 ATIVO 2014 1 3.0 0 Reprovado -5.872282 -35.206600 4.027258
2 5 3 59088150 2014 1 ATIVO 2014 2 7.2 0 Aprovado -5.872282 -35.206600 4.027258
3 8 5 59112430 2014 1 ATIVO 2014 2 8.9 0 Aprovado -5.750546 -35.258952 10.830276
4 12 7 59020330 2014 1 ATIVO 2014 2 5.6 0 Aprovado -5.798325 -35.202731 4.301033

Determinar Correlação entre média de notas e distância até a UF

Covariância -> (Σ[(xi-xmed)*(yi-ymed)])/n-1

Variância de cada variável (X,Y) -> Σ(xi-xmed)² e Σ(yi-ymed)²

Correlação -> Cov(X,Y)/[√Va(X) * √Va(Y)]


In [44]:
#Considera apenas as aprovações -> Validação 1
distancias = distancias[distancias["status.disciplina"] == 'Aprovado']

#Removendo distâncias desnecessárias -> Validação 2 e 3
distancias = distancias[distancias['KM'] != 0]
distancias = distancias[distancias['CEP'] != 0]

#Validação 4
distancias = distancias[distancias['KM'] < 30]

distancias


Out[44]:
index a_ID CEP ano_ingresso periodo_ingresso status ano_disciplina periodo_disciplina nota disciplina_ID status.disciplina LAT LON KM
0 3 2 59072580 2014 1 ATIVO 2014 2 6.1 0 Aprovado -5.832998 -35.242542 3.451075
2 5 3 59088150 2014 1 ATIVO 2014 2 7.2 0 Aprovado -5.872282 -35.206600 4.027258
3 8 5 59112430 2014 1 ATIVO 2014 2 8.9 0 Aprovado -5.750546 -35.258952 10.830276
4 12 7 59020330 2014 1 ATIVO 2014 2 5.6 0 Aprovado -5.798325 -35.202731 4.301033
12 28 15 59151800 2014 1 ATIVO 2014 2 8.5 0 Aprovado -5.890733 -35.196979 6.242204
14 36 20 59090620 2014 1 ATIVO 2014 2 6.4 0 Aprovado -5.885479 -35.168988 7.204428
16 39 23 59070750 2014 2 ATIVO 2015 1 8.9 0 Aprovado -5.833169 -35.231361 2.220069
17 41 25 59115685 2014 1 ATIVO 2016 1 7.3 0 Aprovado -5.744671 -35.271933 12.132286
20 44 26 59290000 2014 1 ATIVO 2015 2 6.6 0 Aprovado -5.776116 -35.368568 18.616898
21 45 27 59040240 2014 1 ATIVO 2014 1 5.9 0 Aprovado -5.793517 -35.223136 4.891914
22 49 30 59094300 2014 1 ATIVO 2014 2 6.8 0 Aprovado -5.871294 -35.184497 4.902081
23 50 31 59020400 2014 1 ATIVO 2014 2 8.4 0 Aprovado -5.787405 -35.201232 5.515917
24 52 33 59122495 2014 1 ATIVO 2014 2 7.5 0 Aprovado -5.750008 -35.225788 9.661615
31 62 38 59124090 2014 1 ATIVO 2014 2 7.9 0 Aprovado -5.752227 -35.245387 10.014439
32 63 39 59090310 2014 1 ATIVO 2014 2 6.0 0 Aprovado -5.888454 -35.169960 7.388841
33 65 41 59151250 2014 1 ATIVO 2014 2 7.7 0 Aprovado -5.873118 -35.215692 4.107965
37 76 47 59088310 2014 1 ATIVO 2016 1 5.6 0 Aprovado -5.873927 -35.204509 4.243846
40 85 54 59037390 2014 1 ATIVO 2015 1 7.9 0 Aprovado -5.805039 -35.221957 3.634193
41 86 55 59082070 2014 1 ATIVO 2014 2 7.1 0 Aprovado -5.857290 -35.200163 2.650718
43 89 57 59149300 2014 1 ATIVO 2015 1 5.9 0 Aprovado -5.882702 -35.245286 6.356414
44 92 60 59148630 2014 1 ATIVO 2014 2 8.7 0 Aprovado -5.879199 -35.236217 5.483844
45 93 61 59151400 2014 1 ATIVO 2014 2 5.1 0 Aprovado -5.882595 -35.207194 5.152835
46 94 61 59151400 2014 1 ATIVO 2015 1 8.1 0 Aprovado -5.882595 -35.207194 5.152835
47 95 62 59090500 2014 1 ATIVO 2014 2 8.1 0 Aprovado -5.884906 -35.178427 6.515956
48 96 63 59115555 2014 1 ATIVO 2014 2 9.9 0 Aprovado -5.750124 -35.277156 11.975802
50 98 64 59141300 2014 1 ATIVO 2015 1 6.1 0 Aprovado -5.921855 -35.278032 11.996945
53 107 70 59022500 2014 1 ATIVO 2014 1 9.2 0 Aprovado -5.800558 -35.202689 4.062042
55 109 71 59062530 2014 1 ATIVO 2014 2 7.1 0 Aprovado -5.821045 -35.218215 1.831901
56 110 72 59054220 2014 1 ATIVO 2014 2 7.8 0 Aprovado -5.814371 -35.214206 2.432275
60 120 80 59290000 2014 1 ATIVO 2014 2 7.4 0 Aprovado -5.776116 -35.368568 18.616898
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2430 3639 627 59080480 2016 1 ATIVO 2016 2 9.5 0 Aprovado -5.860358 -35.209459 2.681085
2432 3641 629 59020130 2016 1 ATIVO 2016 2 7.1 0 Aprovado -5.789681 -35.205443 5.188853
2434 3645 633 59022530 2016 1 ATIVO 2016 2 7.5 0 Aprovado -5.803891 -35.205768 3.630148
2442 3655 642 59147495 2016 1 ATIVO 2016 2 7.7 0 Aprovado -5.901868 -35.271461 9.836399
2443 3657 644 59080115 2016 1 ATIVO 2016 1 5.5 0 Aprovado -5.854938 -35.208017 2.108250
2448 3662 649 59088460 2016 1 ATIVO 2016 1 5.7 0 Aprovado -5.874749 -35.208155 4.279108
2451 3666 653 59054375 2016 1 ATIVO 2016 2 5.9 0 Aprovado -5.808578 -35.221974 3.265997
2453 3668 655 59150600 2016 1 ATIVO 2016 2 9.2 0 Aprovado -5.880031 -35.219560 4.927353
2455 3670 657 59020030 2016 1 ATIVO 2016 2 7.4 0 Aprovado -5.783906 -35.200545 5.910259
2456 3671 658 59030490 2016 1 ATIVO 2016 1 9.9 0 Aprovado -5.797771 -35.207870 4.269451
2457 3674 660 59080115 2016 1 ATIVO 2016 2 6.8 0 Aprovado -5.854938 -35.208017 2.108250
2459 3676 662 59136160 2016 1 ATIVO 2016 2 8.2 0 Aprovado -5.723189 -35.248523 13.151760
2461 3678 664 59149150 2016 1 ATIVO 2016 2 9.0 0 Aprovado -5.883695 -35.250041 6.764416
2465 3683 669 59151550 2016 1 ATIVO 2016 2 7.2 0 Aprovado -5.886822 -35.201364 5.710113
2467 3685 671 59062560 2016 1 ATIVO 2016 1 7.2 0 Aprovado -5.823277 -35.218246 1.610595
2469 3687 673 59022020 2016 1 ATIVO 2016 2 8.6 0 Aprovado -5.794253 -35.205981 4.679806
2470 3688 674 59086040 2016 1 ATIVO 2016 2 7.8 0 Aprovado -5.862964 -35.208036 2.984729
2472 3690 572 59082260 2016 1 ATIVO 2016 1 9.0 1 Aprovado -5.869121 -35.188288 4.459189
2473 3691 575 59054620 2016 1 ATIVO 2016 1 8.4 1 Aprovado -5.810181 -35.215076 2.904157
2477 3698 578 59149323 2016 1 ATIVO 2016 1 6.6 1 Aprovado -5.880711 -35.248092 6.372321
2480 3701 580 59112430 2016 1 ATIVO 2016 1 7.6 1 Aprovado -5.750546 -35.258952 10.830276
2481 3702 581 59152820 2016 1 ATIVO 2016 1 8.3 1 Aprovado -5.887114 -35.213128 5.632735
2484 3707 582 59290000 2016 1 ATIVO 2016 1 6.1 1 Aprovado -5.776116 -35.368568 18.616898
2486 3709 685 59064500 2016 1 ATIVO 2016 2 7.6 1 Aprovado -5.823571 -35.220052 1.684988
2489 3712 687 59162000 2016 1 ATIVO 2016 1 7.7 1 Aprovado -6.030114 -35.280762 22.772292
2491 3714 689 59147540 2016 1 ATIVO 2016 1 5.5 1 Aprovado -5.902774 -35.270289 9.824196
2492 3715 585 59146810 2016 1 ATIVO 2016 1 8.0 1 Aprovado -5.899329 -35.269282 9.466405
2494 3718 692 59054680 2016 1 ATIVO 2016 1 7.7 1 Aprovado -5.811699 -35.213294 2.716777
2495 3721 695 59074368 2016 1 ATIVO 2016 1 5.0 1 Aprovado -5.828887 -35.257980 5.205823
2496 3722 696 59150666 2016 1 ATIVO 2016 1 8.2 1 Aprovado -5.881965 -35.216787 5.093792

1056 rows × 14 columns


In [72]:
#Lista com ID dos alunos validados
alunos_validos = distancias.a_ID.unique()
alunos_validos


Out[72]:
array([  2,   3,   5,   7,   8,  12,  14,  15,  16,  20,  21,  23,  25,
        26,  27,  30,  31,  33,  34,  37,  38,  39,  41,  42,  45,  47,
        48,  54,  55,  57,  60,  61,  62,  63,  64,  69,  70,  71,  72,
        73,  74,  77,  80,  83,  86,  87,  90,  91,  92,  93,  95,  98,
       101, 102, 103, 108, 110, 111, 112, 115, 116, 117, 120, 121, 123,
       125, 128, 129, 130, 131, 132, 133, 134, 136, 137, 139, 145, 146,
       147, 150, 152, 153, 154, 155, 156, 157, 158, 160, 162, 165, 166,
       188, 196, 197, 198, 201, 204, 208, 210, 218, 220, 228, 236, 237,
       238, 240, 242, 243, 245, 249, 253, 257, 259, 260, 261, 262, 263,
       264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 275, 276, 277,
       278, 279, 280, 281, 282, 284, 285, 286, 287, 289, 290, 291, 292,
       293, 294, 295, 296, 297, 300, 301, 302, 303, 304, 306, 307, 308,
       309, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323,
       324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 337,
       338, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 351, 353,
       354, 355, 356, 357, 359, 361, 362, 363, 365, 366, 368, 370, 371,
       372, 373, 374, 377, 378, 380, 381, 383, 386, 387, 388, 389, 390,
       391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 403, 405, 406,
       407, 408, 409, 410, 411, 412, 414, 416, 417, 418, 419, 423, 424,
       425, 426, 427, 429, 430, 431, 432, 434, 435, 437, 438, 439, 440,
       441, 442, 444, 445, 447, 448, 449, 450, 451, 452, 453, 454, 455,
       456, 457, 458, 459, 461, 463, 464, 465, 466, 467, 468, 470, 475,
       478, 479, 481, 489, 492, 494, 495, 496, 497, 501, 505, 506, 509,
       513, 517, 520, 522, 525, 216, 533, 535, 539, 540, 541, 545, 547,
       548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 560, 561,
       562, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575,
       576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588,
       589, 590, 591, 592, 593, 594, 595, 596, 598, 599, 601, 602, 603,
       604, 605, 606, 607, 608, 609, 610, 611, 612, 613, 614, 615, 616,
       617, 193, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628,
       629, 632, 633, 634, 635, 636, 638, 639, 640, 642, 644, 645, 646,
       647, 648, 649, 650, 652, 653, 654, 655, 656, 657, 658, 660, 661,
       662, 663, 664, 665, 667, 668, 669, 670, 671, 672, 673, 674, 675,
       679, 680, 681, 682, 685, 686, 687, 688, 689, 691, 692, 695, 696,
       697, 698, 699, 700, 702, 704, 705, 706, 707, 708, 709, 710, 711,
       712, 713, 714, 718, 719, 722, 724, 725, 726, 729, 730, 731, 732,
       734, 735, 737, 738, 739, 740, 742, 743, 744, 745, 748, 749, 750,
       751, 752, 753, 754, 755, 756,  82, 757, 758, 759, 761, 762, 763,
       764, 765, 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 780,
       785, 787, 788, 789, 790, 791, 792, 793, 795, 796, 797, 798, 799,
       800, 802, 805, 806, 807, 808, 809, 810, 811, 812, 814, 815, 816,
       817, 819, 820, 823, 824, 828, 830, 831, 832, 833, 834, 836, 837,
       838, 840, 841, 842, 843, 845, 847, 849, 850, 852, 853, 856, 857,
       858, 860, 861, 864, 866, 867, 868, 869, 870, 871, 872, 873, 874,
       877, 879, 880, 881, 885, 887, 888, 889, 890, 891, 892, 893, 894,
       895, 896, 897, 898])

Ocorrência dos dados


In [73]:
#Valores 'chave' da análise
valor_x_list = []
valor_y_list = []
for i in range(len(alunos_validos)):
    
    #Seleciona todas as ocorrências do aluno com aquele ID
    aluno = distancias[distancias["a_ID"] == alunos_validos[i]]    
    
    #Calcula média das notas para aquele aluno
    media =  aluno['nota'].mean()
    distancia_UF = aluno['KM'].mean()
    
    #Adiciona resultado à lista de distribuição X 
    if( media <= 6.0 ):
        valor_x_list.append(0)
    elif( media > 6.0 and media <= 7.0 ):
        valor_x_list.append(1)
    elif( media > 7.0 and media <= 8.0 ):
        valor_x_list.append(2)
    else:
        valor_x_list.append(3)
        
    #Adiciona resultado à lista de distribuição Y 
    if( distancia_UF <= 1.5 ):
        valor_y_list.append(0)
    elif( distancia_UF > 1.5 and distancia_UF <= 4.0 ):
        valor_y_list.append(1)
    elif( distancia_UF > 4.0 and distancia_UF <= 8.0 ):
        valor_y_list.append(2)
    else:
        valor_y_list.append(3)

Tabela de Ocorrências


In [74]:
#Tabela de Ocorrências
distribuicao = pd.DataFrame(columns=('Valor X_Nota', 'Valor Y_Distancia', 'Xi - Xmed', 'Yi - Ymed'
                                     , 'Prod', '(Xi - Xmed)^2', '(Yi - Ymed)^2' ) )

distribuicao["Valor X_Nota"] = valor_x_list
distribuicao["Valor Y_Distancia"] = valor_y_list

distribuicao
#distancias["LAT"], distancias["LON"], distancias["KM"] = [0,0,0]


Out[74]:
Valor X_Nota Valor Y_Distancia Xi - Xmed Yi - Ymed Prod (Xi - Xmed)^2 (Yi - Ymed)^2
0 0 1 NaN NaN NaN NaN NaN
1 0 2 NaN NaN NaN NaN NaN
2 2 3 NaN NaN NaN NaN NaN
3 0 2 NaN NaN NaN NaN NaN
4 1 0 NaN NaN NaN NaN NaN
5 0 3 NaN NaN NaN NaN NaN
6 0 0 NaN NaN NaN NaN NaN
7 3 2 NaN NaN NaN NaN NaN
8 2 3 NaN NaN NaN NaN NaN
9 0 2 NaN NaN NaN NaN NaN
10 3 3 NaN NaN NaN NaN NaN
11 0 1 NaN NaN NaN NaN NaN
12 0 3 NaN NaN NaN NaN NaN
13 0 3 NaN NaN NaN NaN NaN
14 2 2 NaN NaN NaN NaN NaN
15 0 2 NaN NaN NaN NaN NaN
16 3 2 NaN NaN NaN NaN NaN
17 2 3 NaN NaN NaN NaN NaN
18 0 3 NaN NaN NaN NaN NaN
19 0 2 NaN NaN NaN NaN NaN
20 1 3 NaN NaN NaN NaN NaN
21 0 2 NaN NaN NaN NaN NaN
22 2 2 NaN NaN NaN NaN NaN
23 0 3 NaN NaN NaN NaN NaN
24 2 3 NaN NaN NaN NaN NaN
25 1 2 NaN NaN NaN NaN NaN
26 0 0 NaN NaN NaN NaN NaN
27 0 1 NaN NaN NaN NaN NaN
28 1 1 NaN NaN NaN NaN NaN
29 0 2 NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ...
559 0 0 NaN NaN NaN NaN NaN
560 0 0 NaN NaN NaN NaN NaN
561 0 0 NaN NaN NaN NaN NaN
562 0 0 NaN NaN NaN NaN NaN
563 0 0 NaN NaN NaN NaN NaN
564 2 0 NaN NaN NaN NaN NaN
565 0 0 NaN NaN NaN NaN NaN
566 0 0 NaN NaN NaN NaN NaN
567 0 0 NaN NaN NaN NaN NaN
568 0 0 NaN NaN NaN NaN NaN
569 1 0 NaN NaN NaN NaN NaN
570 0 0 NaN NaN NaN NaN NaN
571 0 0 NaN NaN NaN NaN NaN
572 0 0 NaN NaN NaN NaN NaN
573 0 0 NaN NaN NaN NaN NaN
574 2 0 NaN NaN NaN NaN NaN
575 0 0 NaN NaN NaN NaN NaN
576 2 0 NaN NaN NaN NaN NaN
577 0 0 NaN NaN NaN NaN NaN
578 0 0 NaN NaN NaN NaN NaN
579 0 0 NaN NaN NaN NaN NaN
580 0 0 NaN NaN NaN NaN NaN
581 0 0 NaN NaN NaN NaN NaN
582 0 0 NaN NaN NaN NaN NaN
583 0 0 NaN NaN NaN NaN NaN
584 0 0 NaN NaN NaN NaN NaN
585 0 0 NaN NaN NaN NaN NaN
586 1 0 NaN NaN NaN NaN NaN
587 0 0 NaN NaN NaN NaN NaN
588 0 0 NaN NaN NaN NaN NaN

589 rows × 7 columns

Cálculo do Xmed e Ymed (Valor médio de todos os pontos de X e Y)


In [75]:
xmed =  distribuicao['Valor X_Nota'].mean()
ymed =  distribuicao['Valor Y_Distancia'].mean()

Cálculo (Xi - Xmed) e (Yi - Ymed) i = índice dos valores de X e Y


In [76]:
dif_X = []
dif_Y = []
prod = []

for i in range(len(alunos_validos)):
    
    #Calcula diferença para cada valor de X e salva numa lista
    difX = valor_x_list[i] - xmed
    
    dif_X.append(difX)
    
    #Calcula diferença para cada valor de Y e salva numa lista
    difY = valor_y_list[i] - ymed
    
    dif_Y.append(difY)
    
    #Calcula produto entre valores
    prod_Difs = difX*difY
    
    prod.append(prod_Difs)
    
#Adiciona na tabela
distribuicao["Xi - Xmed"] = dif_X
distribuicao["Yi - Ymed"] = dif_Y
distribuicao["Prod"] = prod

In [53]:
distribuicao


Out[53]:
Valor X_Nota Valor Y_Distancia Xi - Xmed Yi - Ymed Prod (Xi - Xmed)^2 (Yi - Ymed)^2
0 1 1 -0.445283 -0.973585 0.433521 NaN NaN
1 1 2 -0.445283 0.026415 -0.011762 NaN NaN
2 2 3 0.554717 1.026415 0.569370 NaN NaN
3 1 2 -0.445283 0.026415 -0.011762 NaN NaN
4 3 2 1.554717 0.026415 0.041068 NaN NaN
5 1 2 -0.445283 0.026415 -0.011762 NaN NaN
6 2 1 0.554717 -0.973585 -0.540064 NaN NaN
7 1 3 -0.445283 1.026415 -0.457045 NaN NaN
8 0 3 -1.445283 1.026415 -1.483460 NaN NaN
9 2 2 0.554717 0.026415 0.014653 NaN NaN
10 1 2 -0.445283 0.026415 -0.011762 NaN NaN
11 3 2 1.554717 0.026415 0.041068 NaN NaN
12 2 3 0.554717 1.026415 0.569370 NaN NaN
13 2 3 0.554717 1.026415 0.569370 NaN NaN
14 1 2 -0.445283 0.026415 -0.011762 NaN NaN
15 2 2 0.554717 0.026415 0.014653 NaN NaN
16 1 2 -0.445283 0.026415 -0.011762 NaN NaN
17 1 1 -0.445283 -0.973585 0.433521 NaN NaN
18 1 1 -0.445283 -0.973585 0.433521 NaN NaN
19 1 2 -0.445283 0.026415 -0.011762 NaN NaN
20 2 2 0.554717 0.026415 0.014653 NaN NaN
21 1 2 -0.445283 0.026415 -0.011762 NaN NaN
22 2 2 0.554717 0.026415 0.014653 NaN NaN
23 3 3 1.554717 1.026415 1.595785 NaN NaN
24 0 3 -1.445283 1.026415 -1.483460 NaN NaN
25 2 2 0.554717 0.026415 0.014653 NaN NaN
26 0 1 -1.445283 -0.973585 1.407106 NaN NaN
27 2 1 0.554717 -0.973585 -0.540064 NaN NaN
28 0 3 -1.445283 1.026415 -1.483460 NaN NaN
29 1 2 -0.445283 0.026415 -0.011762 NaN NaN
... ... ... ... ... ... ... ...
235 3 3 1.554717 1.026415 1.595785 NaN NaN
236 0 2 -1.445283 0.026415 -0.038177 NaN NaN
237 0 3 -1.445283 1.026415 -1.483460 NaN NaN
238 3 3 1.554717 1.026415 1.595785 NaN NaN
239 3 1 1.554717 -0.973585 -1.513649 NaN NaN
240 2 2 0.554717 0.026415 0.014653 NaN NaN
241 2 1 0.554717 -0.973585 -0.540064 NaN NaN
242 2 3 0.554717 1.026415 0.569370 NaN NaN
243 0 1 -1.445283 -0.973585 1.407106 NaN NaN
244 0 2 -1.445283 0.026415 -0.038177 NaN NaN
245 0 1 -1.445283 -0.973585 1.407106 NaN NaN
246 3 2 1.554717 0.026415 0.041068 NaN NaN
247 2 2 0.554717 0.026415 0.014653 NaN NaN
248 3 2 1.554717 0.026415 0.041068 NaN NaN
249 1 1 -0.445283 -0.973585 0.433521 NaN NaN
250 3 3 1.554717 1.026415 1.595785 NaN NaN
251 3 2 1.554717 0.026415 0.041068 NaN NaN
252 2 2 0.554717 0.026415 0.014653 NaN NaN
253 2 1 0.554717 -0.973585 -0.540064 NaN NaN
254 3 2 1.554717 0.026415 0.041068 NaN NaN
255 2 1 0.554717 -0.973585 -0.540064 NaN NaN
256 1 2 -0.445283 0.026415 -0.011762 NaN NaN
257 2 3 0.554717 1.026415 0.569370 NaN NaN
258 3 2 1.554717 0.026415 0.041068 NaN NaN
259 2 1 0.554717 -0.973585 -0.540064 NaN NaN
260 2 3 0.554717 1.026415 0.569370 NaN NaN
261 0 3 -1.445283 1.026415 -1.483460 NaN NaN
262 2 1 0.554717 -0.973585 -0.540064 NaN NaN
263 0 2 -1.445283 0.026415 -0.038177 NaN NaN
264 3 2 1.554717 0.026415 0.041068 NaN NaN

265 rows × 7 columns

Calcula a covariância


In [77]:
passo1 = distribuicao['Prod'].sum()

covXY = passo1/(len(alunos_validos)-1)
covXY


Out[77]:
0.1940652322049366

In [78]:
dif_X_quadrada = []
dif_Y_quadrada = []

for i in range(len(alunos_validos)):
    
    #Eleva cada diferença ao quadrado
    difX_quad = dif_X[i]*dif_X[i]
    
    dif_X_quadrada.append(difX_quad)
    
    #Eleva cada diferença ao quadrado
    difY_quad = dif_Y[i]*dif_Y[i]
    
    dif_Y_quadrada.append(difY_quad)
    
#Adiciona na tabela
distribuicao["(Xi - Xmed)^2"] = dif_X_quadrada
distribuicao["(Yi - Ymed)^2"] = dif_Y_quadrada

In [79]:
distribuicao


Out[79]:
Valor X_Nota Valor Y_Distancia Xi - Xmed Yi - Ymed Prod (Xi - Xmed)^2 (Yi - Ymed)^2
0 0 1 -0.573854 -0.42275 0.242597 0.329308 0.178718
1 0 2 -0.573854 0.57725 -0.331257 0.329308 0.333217
2 2 3 1.426146 1.57725 2.249388 2.033892 2.487716
3 0 2 -0.573854 0.57725 -0.331257 0.329308 0.333217
4 1 0 0.426146 -1.42275 -0.606299 0.181600 2.024219
5 0 3 -0.573854 1.57725 -0.905111 0.329308 2.487716
6 0 0 -0.573854 -1.42275 0.816451 0.329308 2.024219
7 3 2 2.426146 0.57725 1.400492 5.886184 0.333217
8 2 3 1.426146 1.57725 2.249388 2.033892 2.487716
9 0 2 -0.573854 0.57725 -0.331257 0.329308 0.333217
10 3 3 2.426146 1.57725 3.826638 5.886184 2.487716
11 0 1 -0.573854 -0.42275 0.242597 0.329308 0.178718
12 0 3 -0.573854 1.57725 -0.905111 0.329308 2.487716
13 0 3 -0.573854 1.57725 -0.905111 0.329308 2.487716
14 2 2 1.426146 0.57725 0.823242 2.033892 0.333217
15 0 2 -0.573854 0.57725 -0.331257 0.329308 0.333217
16 3 2 2.426146 0.57725 1.400492 5.886184 0.333217
17 2 3 1.426146 1.57725 2.249388 2.033892 2.487716
18 0 3 -0.573854 1.57725 -0.905111 0.329308 2.487716
19 0 2 -0.573854 0.57725 -0.331257 0.329308 0.333217
20 1 3 0.426146 1.57725 0.672139 0.181600 2.487716
21 0 2 -0.573854 0.57725 -0.331257 0.329308 0.333217
22 2 2 1.426146 0.57725 0.823242 2.033892 0.333217
23 0 3 -0.573854 1.57725 -0.905111 0.329308 2.487716
24 2 3 1.426146 1.57725 2.249388 2.033892 2.487716
25 1 2 0.426146 0.57725 0.245993 0.181600 0.333217
26 0 0 -0.573854 -1.42275 0.816451 0.329308 2.024219
27 0 1 -0.573854 -0.42275 0.242597 0.329308 0.178718
28 1 1 0.426146 -0.42275 -0.180153 0.181600 0.178718
29 0 2 -0.573854 0.57725 -0.331257 0.329308 0.333217
... ... ... ... ... ... ... ...
559 0 0 -0.573854 -1.42275 0.816451 0.329308 2.024219
560 0 0 -0.573854 -1.42275 0.816451 0.329308 2.024219
561 0 0 -0.573854 -1.42275 0.816451 0.329308 2.024219
562 0 0 -0.573854 -1.42275 0.816451 0.329308 2.024219
563 0 0 -0.573854 -1.42275 0.816451 0.329308 2.024219
564 2 0 1.426146 -1.42275 -2.029050 2.033892 2.024219
565 0 0 -0.573854 -1.42275 0.816451 0.329308 2.024219
566 0 0 -0.573854 -1.42275 0.816451 0.329308 2.024219
567 0 0 -0.573854 -1.42275 0.816451 0.329308 2.024219
568 0 0 -0.573854 -1.42275 0.816451 0.329308 2.024219
569 1 0 0.426146 -1.42275 -0.606299 0.181600 2.024219
570 0 0 -0.573854 -1.42275 0.816451 0.329308 2.024219
571 0 0 -0.573854 -1.42275 0.816451 0.329308 2.024219
572 0 0 -0.573854 -1.42275 0.816451 0.329308 2.024219
573 0 0 -0.573854 -1.42275 0.816451 0.329308 2.024219
574 2 0 1.426146 -1.42275 -2.029050 2.033892 2.024219
575 0 0 -0.573854 -1.42275 0.816451 0.329308 2.024219
576 2 0 1.426146 -1.42275 -2.029050 2.033892 2.024219
577 0 0 -0.573854 -1.42275 0.816451 0.329308 2.024219
578 0 0 -0.573854 -1.42275 0.816451 0.329308 2.024219
579 0 0 -0.573854 -1.42275 0.816451 0.329308 2.024219
580 0 0 -0.573854 -1.42275 0.816451 0.329308 2.024219
581 0 0 -0.573854 -1.42275 0.816451 0.329308 2.024219
582 0 0 -0.573854 -1.42275 0.816451 0.329308 2.024219
583 0 0 -0.573854 -1.42275 0.816451 0.329308 2.024219
584 0 0 -0.573854 -1.42275 0.816451 0.329308 2.024219
585 0 0 -0.573854 -1.42275 0.816451 0.329308 2.024219
586 1 0 0.426146 -1.42275 -0.606299 0.181600 2.024219
587 0 0 -0.573854 -1.42275 0.816451 0.329308 2.024219
588 0 0 -0.573854 -1.42275 0.816451 0.329308 2.024219

589 rows × 7 columns

Calcula Variância de X e Y


In [80]:
passo21 = distribuicao['(Xi - Xmed)^2'].sum()
passo22 = distribuicao['(Yi - Ymed)^2'].sum()

#Variância de X
varX = passo21/(len(alunos_validos)-1)

#Variância de Y
varY = passo22/(len(alunos_validos)-1)

In [81]:
varX


Out[81]:
0.8708118221821896

In [82]:
varY


Out[82]:
1.7138352794428442

Executar passo 3

Calcular Correlação


In [83]:
from math import sqrt

desvio_padraoX = sqrt(varX)
desvio_padraoY = sqrt(varY)

#Calculo da correlação
corrXY = covXY/(desvio_padraoX*desvio_padraoY)

corrXY


Out[83]:
0.15885505481053944

In [ ]: